# pacotes ---------------
library(tidyverse)
library(GGally)
# Exemplo de conexão com um MariaDB (também conhecido como MySQL) remoto.
con_mariadb <- DBI::dbConnect(
RMariaDB::MariaDB(),
host = "relational.fit.cvut.cz",
port = 3306,
username = "guest",
password = "relational",
dbname = "financial"
)
DBI::dbListTables(con_mariadb)
[1] "account" "card" "client" "disp" "district" "loan" "order" "trans"
# Conexao com SQLite --------------------------------------------------------------
con <- DBI::dbConnect(RSQLite::SQLite(), "dados_consultoria.db")
# Acessando o SQL a partir do R ------------------------------------------------
# computação é feita lá no servidor.
tbl(con, "indicadores") %>%
count(id)
# dá pra consultar a query de SQL que rodou lá no servidor.
tbl(con, "indicadores") %>%
count(id) %>%
show_query()
<SQL>
SELECT `id`, COUNT(*) AS `n`
FROM `indicadores`
GROUP BY `id`
# passando do SQL pro R --------------------------------------------------------
# collect() faz o download pro computado local (não estamos mais usando o servidor).
indicadores <- tbl(con, "indicadores") %>% collect()
# PS: o SQLite não tem formato de datas, então tem que transformar em data quando vem pro R. Esse problema não tem no MySQL ou no SQL Server.
indicadores <- indicadores %>%
mutate(
data = as.Date(data)
)
Informação 1 - Sobre os IDs, o cliente informou que deveria ter apenas uma linha para cada trinca (id-ano-mes). Por conta de uma inconsistência, poderia acontecer de virem duas ou mais linhas para o mesma trinca (id-ano-mes). O correto é ter apenas uma linha apenas. Eles disseram que a linha com o maior valor de agendamento tem mais chance de ser a correta.
# Solução: arrange() + distinct()
indicadores <- indicadores %>%
dplyr::arrange(desc(agendamento)) %>%
dplyr::distinct(id, ano, mes, .keep_all = TRUE)
Informação 2 - Sobre as séries mensais, o cliente informou que:
# Olhando o problema dos meses faltantes
indicadores %>%
ggplot(aes(x = data, y = id, colour = id)) +
geom_point(size = 5)
# Solução: {padr} + {tidyr} (exemplo com o id 970)
indicadores %>%
dplyr::filter(id == 970) %>%
dplyr::arrange(data) %>%
padr::pad(interval = "month", group = "id")
indicadores_com_pad <- indicadores %>%
padr::pad(interval = "month", group = "id")
# padr::pad() consertou
indicadores_com_pad %>%
ggplot(aes(x = data, y = id, colour = id)) +
geom_point(size = 5)
# agora tem que preencher os NAs com fill.
indicadores_com_pad <- indicadores_com_pad %>%
dplyr::arrange(id, data) %>%
tidyr::fill(agendamento:cidade) %>%
dplyr::mutate(
# mes e ano não dá pra preencher com fill diretamente
mes = as.character(lubridate::month(data)),
ano = as.character(lubridate::year(data))
)
# features no tempo (exemplo) ---------------------
teste <- indicadores_com_pad %>%
filter(id %in% c(406, 420), data %>% between(as.Date("2019-01-01"), as.Date("2019-04-01"))) %>%
select(id, data)
# transformacoes
teste %>%
arrange(id, data) %>%
group_by(id) %>%
mutate(
x = 1:n(),
a = cumsum(x),
b = lag(x),
c = lag(x, n = 2),
d = lead(x),
e = slider::slide_dbl(x, mean, .before = 1, .after = 0),
f = slider::slide_dbl(x, mean, .before = 0, .after = 0),
h = x/lag(x)
)
# https://github.com/gastonstat/CreditScoring
# http://bit.ly/2kkBFrk
library(modeldata)
data(credit_data)
glimpse(credit_data) # German Risk
Rows: 4,454
Columns: 14
$ Status <fct> good, good, bad, good, good, good, good, good, good, bad, good, good, good, good, bad, good, good, good,~
$ Seniority <int> 9, 17, 10, 0, 0, 1, 29, 9, 0, 0, 6, 7, 8, 19, 0, 0, 15, 33, 0, 1, 2, 5, 1, 27, 26, 12, 19, 15, 3, 0, 4, ~
$ Home <fct> rent, rent, owner, rent, rent, owner, owner, parents, owner, parents, owner, owner, owner, priv, other, ~
$ Time <int> 60, 60, 36, 60, 36, 60, 60, 12, 60, 48, 48, 36, 60, 36, 18, 24, 24, 24, 48, 60, 60, 60, 60, 60, 60, 36, ~
$ Age <int> 30, 58, 46, 24, 26, 36, 44, 27, 32, 41, 34, 29, 30, 37, 21, 68, 52, 68, 36, 31, 25, 22, 45, 41, 51, 54, ~
$ Marital <fct> married, widow, married, single, single, married, married, single, married, married, married, married, m~
$ Records <fct> no, no, yes, no, no, no, no, no, no, no, no, no, no, no, yes, no, no, no, no, no, no, no, no, no, no, no~
$ Job <fct> freelance, fixed, freelance, fixed, fixed, fixed, fixed, fixed, freelance, partime, freelance, fixed, fi~
$ Expenses <int> 73, 48, 90, 63, 46, 75, 75, 35, 90, 90, 60, 60, 75, 75, 35, 75, 35, 65, 45, 35, 46, 45, 105, 74, 45, 60,~
$ Income <int> 129, 131, 200, 182, 107, 214, 125, 80, 107, 80, 125, 121, 199, 170, 50, 131, 330, 200, 130, 137, 107, 32~
$ Assets <int> 0, 0, 3000, 2500, 0, 3500, 10000, 0, 15000, 0, 4000, 3000, 5000, 3500, 0, 4162, 16500, 5000, 750, 0, 0, ~
$ Debt <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2500, 260, 0, 0, 0, 2000, 0, 0, 0, 0, 500, 0, 0, 0, 0, 0, 0, NA, 330~
$ Amount <int> 800, 1000, 2000, 900, 310, 650, 1600, 200, 1200, 1200, 1150, 650, 1500, 600, 400, 900, 1500, 600, 1100, ~
$ Price <int> 846, 1658, 2985, 1325, 910, 1645, 1800, 1093, 1957, 1468, 1577, 915, 1650, 940, 500, 1186, 2201, 1350, 1~
credit_data %>% count(Status)
# trazer do servidor para o R (memória do computador local)
credit_data <- credit_data %>% collect()
set.seed(1)
credit_initial_split <- initial_split(credit_data, strata = "Status", prop = 0.75)
credit_train <- training(credit_initial_split)
credit_test <- testing(credit_initial_split)
skimr::skim(credit_train)
-- Data Summary ------------------------
Values
Name credit_train
Number of rows 3340
Number of columns 14
_______________________
Column type frequency:
factor 5
numeric 9
________________________
Group variables None
-- Variable type: factor ---------------------------------------------------------------------------------------------------
# A tibble: 5 x 6
skim_variable n_missing complete_rate ordered n_unique top_counts
* <chr> <int> <dbl> <lgl> <int> <chr>
1 Status 0 1 FALSE 2 goo: 2400, bad: 940
2 Home 5 0.999 FALSE 6 own: 1604, ren: 728, par: 572, oth: 236
3 Marital 1 1.00 FALSE 5 mar: 2443, sin: 714, sep: 102, wid: 50
4 Records 0 1 FALSE 2 no: 2756, yes: 584
5 Job 2 0.999 FALSE 4 fix: 2113, fre: 754, par: 339, oth: 132
-- Variable type: numeric --------------------------------------------------------------------------------------------------
# A tibble: 9 x 11
skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
* <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
1 Seniority 0 1 8.05 8.24 0 2 5 12 47 ▇▃▁▁▁
2 Time 0 1 46.1 14.8 6 36 48 60 60 ▁▂▅▃▇
3 Age 0 1 37.3 11.0 18 28 36 45 68 ▆▇▆▃▁
4 Expenses 0 1 55.6 19.4 35 35 51 72 180 ▇▃▁▁▁
5 Income 285 0.915 141. 80.8 6 90 125 170 959 ▇▂▁▁▁
6 Assets 35 0.990 5611. 12523. 0 0 3100 6000 300000 ▇▁▁▁▁
7 Debt 14 0.996 336. 1259. 0 0 0 0 30000 ▇▁▁▁▁
8 Amount 0 1 1024. 468. 100 700 1000 1300 5000 ▇▆▁▁▁
9 Price 0 1 1448. 619. 105 1104 1398 1675. 11140 ▇▁▁▁▁
visdat::vis_miss(credit_train)
credit_train %>%
select(where(is.numeric)) %>%
cor(use = "p") %>%
corrplot::corrplot()
credit_train %>%
select(where(is.numeric), Status) %>%
ggpairs(aes(colour = Status))
contagens %>%
ggplot(aes(y = valor, x = n, fill = Status)) +
geom_col(position = "fill") +
geom_label(aes(label = n), position = position_fill(vjust = 0.5)) +
facet_wrap(~variavel, scales = "free_y") +
ggtitle("Status vs. Variáveis Categóricas")
credit_train %>%
select(c(where(is.numeric), Status)) %>%
pivot_longer(-Status, names_to = "variavel", values_to = "valor") %>%
ggplot(aes(y = Status, x = valor, fill = Status)) +
geom_boxplot() +
facet_wrap(~variavel, scales = "free_x") +
ggtitle("Status vs. Variáveis Numéricas")
credit_train %>%
select(c(where(is.numeric), Status)) %>%
pivot_longer(-Status, names_to = "variavel", values_to = "valor") %>%
ggplot(aes(y = Status, x = valor, fill = Status)) +
geom_boxplot() +
facet_wrap(~variavel, scales = "free_x") +
scale_x_log10() +
ggtitle("Status vs. Variáveis Numéricas NA ESCALA LOG")
credit_train %>%
select(c(where(is.numeric), Status)) %>%
pivot_longer(-Status, names_to = "variavel", values_to = "valor") %>%
ggplot(aes(x = valor, colour = Status)) +
stat_ecdf() +
facet_wrap(~variavel, scales = "free_x") +
labs(title = "Status vs. Variáveis Numéricas",
subtitle = "Distribuição Acumulada")
credit_recipe
Data Recipe
Inputs:
Operations:
Zero variance filter on all_predictors()
Variable mutation for Home, Job, Marital, Assets, Income
Bagged tree imputation for Debt
Centering and scaling for all_numeric()
Novel factor level assignment for all_nominal_predictors()
# criando a base preparada
credit_preparada <- bake(prep(credit_recipe), new_data = NULL)
# olhando a base preparada
visdat::vis_miss(credit_preparada)
credit_preparada %>%
select(c(where(is.factor), Status)) %>%
pivot_longer(-Status, names_to = "variavel", values_to = "valor") %>%
count(Status, variavel, valor) %>%
ggplot(aes(y = valor, x = n, fill = Status)) +
geom_col(position = "fill") +
geom_label(aes(label = n), position = position_fill(vjust = 0.5)) +
facet_wrap(~variavel, scales = "free_y", ncol = 3) +
ggtitle("Status vs. Variáveis Categóricas")
grafico_de_barras_das_vars_continuas(credit_preparada)
# finalizando a receita com dummies
credit_recipe <- credit_recipe %>%
step_dummy(all_nominal_predictors())
# Definição de
# a) a f(x): logistc_reg()
# b) modo (natureza da var resp): classification
# c) hiperparametros que queremos tunar: penalty = tune()
# d) hiperparametros que não queremos tunar: mixture = 1 # LASSO
# e) o motor que queremos usar: glmnet
credit_lr_model <- logistic_reg(penalty = tune(), mixture = 1) %>%
set_mode("classification") %>%
set_engine("glmnet")
# workflow
credit_wf <- workflow() %>% add_model(credit_lr_model) %>% add_recipe(credit_recipe)
# a) bases de reamostragem para validação: vfold_cv()
# b) (opcional) grade de parâmetros: parameters() %>% update() %>% grid_regular()
# c) tune_grid(y ~ x + ...)
# d) escolha das métricas (rmse, roc_auc, etc)
# d) collect_metrics() ou autoplot() para ver o resultado
credit_resamples <- vfold_cv(credit_train, v = 5)
credit_lr_tune_grid <- tune_grid(
credit_wf,
resamples = credit_resamples,
grid = 40,
metrics = metric_set(
accuracy,
roc_auc
# kap, # KAPPA
# precision,
# recall,
# f_meas,
# mn_log_loss #binary cross entropy
)
)
autoplot(credit_lr_tune_grid)
show_best(credit_lr_tune_grid)
collect_metrics(credit_lr_tune_grid)
# a) extrai melhor modelo com select_best()
# b) finaliza o modelo inicial com finalize_model()
# c) ajusta o modelo final com todos os dados de treino (bases de validação já era)
credit_lr_best_params <- select_best(credit_lr_tune_grid, "roc_auc")
credit_wf <- credit_wf %>% finalize_workflow(credit_lr_best_params)
credit_lr_last_fit <- last_fit(
credit_wf,
credit_initial_split
)
credit_lr_last_fit
# Resampling results
# Manual resampling